In [72]:
import numpy as np
import shutil
import os
import json
import glob
from itertools import count
from scipy.sparse import coo_matrix, csr_matrix, identity, tril, triu, dia_matrix
from scipy.sparse.linalg import eigsh
import random
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2
from math import sqrt
In [53]:
def get_all_files(basedir, ext):
    '''
    Recursively collect all files with the given extension below basedir
    Arguments
        basedir: string
            Path to the extracted dataset
        ext: string
            File extension (e.g. '.json')
    Returns
        files: ndarray
            Paths of all matching files
        filenames: dict
            Keys are filenames (without extension), values are the root paths of the corresponding files
    '''
    filenames = {}
    files = np.array([])
    for root, _, fnames in os.walk(basedir):
        files = np.concatenate((files, glob.glob(os.path.join(root, '*' + ext))))
        for fn in fnames:
            filenames[os.path.splitext(fn)[0]] = root
    return (files, filenames)
In [54]:
def get_adjacency_matrix(files, t):
    '''
    Construct the weighted symmetric adjacency matrix in COOrdinate format
    Arguments
        files: ndarray
            Filepaths to the song JSON files
        t: int
            Threshold for similarity strength
    Returns
        W: sparse.coo_matrix
            Sparse weighted symmetric adjacency matrix
        song_to_idx: dict
            Maps track ids to row/column indices of W
    '''
    song_to_idx = {}
    song_count = count()
    rc2v = {}
    for file in files:  # parse each file in files
        with open(file, 'r') as f:
            song = json.load(f)  # get the song of each file
        tid = song['track_id']
        if tid not in song_to_idx:
            song_to_idx[tid] = next(song_count)  # assign a unique id to each song
        for sid, similarity_score in song['similars']:
            if sid not in song_to_idx:
                song_to_idx[sid] = next(song_count)  # assign a unique id to each song
            if similarity_score > t:  # add an edge only if the similarity score is strictly greater than threshold t
                row = song_to_idx[tid]  # store a valid entry in our sparse matrix at position (row, col)
                col = song_to_idx[sid]
                if (row, col) not in rc2v:  # maintain similarity scores of edges of the form (u, v) and (v, u)
                    rc2v[(row, col)] = similarity_score
                if (col, row) not in rc2v:
                    rc2v[(col, row)] = similarity_score
                # matrix of an undirected graph, thus take the max of the two directed edges
                rc2v[(row, col)] = rc2v[(col, row)] = max(rc2v[(row, col)], rc2v[(col, row)])
    rows, cols, vals = zip(*[(r, c, v) for (r, c), v in rc2v.items()])
    W = coo_matrix((vals, (rows, cols)), shape=(len(song_to_idx), len(song_to_idx)))  # square matrix over all songs seen
    return (W, song_to_idx)
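As a sanity check, the parser can be exercised on synthetic input; a minimal sketch, assuming the Last.fm JSON schema used above ('track_id', 'similars' as [track_id, score] pairs, 'tags' as [tag, count] pairs):
import tempfile

# two hypothetical toy files mimicking the assumed Last.fm schema
toy_songs = [
    {'track_id': 'A', 'similars': [['B', 0.9], ['C', 0.2]], 'tags': [['rock', '100'], ['indie', '20']]},
    {'track_id': 'B', 'similars': [['A', 0.5]], 'tags': [['rock', '60']]},
]
tmpdir = tempfile.mkdtemp()
toy_files = []
for song in toy_songs:
    path = os.path.join(tmpdir, song['track_id'] + '.json')
    with open(path, 'w') as f:
        json.dump(song, f)
    toy_files.append(path)

W_toy, idx_toy = get_adjacency_matrix(np.array(toy_files), t=0)
print(W_toy.toarray())  # the (A, B) edge appears symmetrically with the max score 0.9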
In [55]:
def get_degree_matrix(M):
    '''
    Computes the out-degree matrix from an adjacency matrix
    Arguments
        M: sparse.xxx_matrix (where xxx can be coo, csr, csc, ...)
            Adjacency matrix
    Returns
        D: sparse.dia_matrix
            Out-degree matrix of M
    '''
    degrees = M.getnnz(axis=1)  # the number of non-zero entries in each row is the out-degree of that row's node
    D = dia_matrix((M.shape[0], M.shape[1]))  # initialize a sparse diagonal matrix of the same size as M
    D.setdiag(degrees)  # set the diagonal entries to the out-degree of each node
    return D
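A note on this design choice: getnnz counts incident edges, so D holds unweighted degrees even though W carries similarity weights. If weighted degrees (row sums of W) were wanted instead, a minimal alternative sketch:
def get_weighted_degree_matrix(M):
    # sum the edge weights per row rather than counting non-zeros
    degrees = np.asarray(M.sum(axis=1)).ravel()
    D = dia_matrix((M.shape[0], M.shape[1]))
    D.setdiag(degrees)
    return D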
In [56]:
def get_laplacian_matrix(D, W, normalized):
    '''
    Compute the Laplacian matrix
    Arguments
        D: sparse.dia_matrix
            Out-degree matrix
        W: sparse.xxx_matrix (where xxx can be coo, csr, csc, ...)
            Sparse weighted symmetric adjacency matrix
        normalized: boolean
            Whether to return the normalized Laplacian
    Returns
        L: sparse matrix
            Un-/normalized Laplacian matrix
    '''
    L = D - W  # un-normalized Laplacian matrix
    if normalized:
        D = D.tocsr()  # convert for slicing
        D = D.sqrt()  # the square root of a diagonal matrix is the element-wise square root of its diagonal
        nz = D.nonzero()
        D[nz] = 1.0 / D[nz]  # invert only the non-zero diagonal entries to avoid division by zero
        D = D.todia()  # transform back to diagonal format; D now holds D^(-1/2)
        L_sym = D * L * D  # compute the normalized Laplacian
        return L_sym
    else:
        return L
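In matrix form the normalized branch computes $L_{sym} = D^{-1/2} L D^{-1/2} = I - D^{-1/2} W D^{-1/2}$. A quick numerical check on a hand-built 3-node graph (a sketch, assuming no isolated nodes, since zero-degree rows stay un-normalized):
W_check = csr_matrix(np.array([[0., 1., 1.],
                               [1., 0., 0.],
                               [1., 0., 0.]]))
D_check = get_degree_matrix(W_check)
L_sym = get_laplacian_matrix(D_check, W_check, normalized=True)
d = W_check.getnnz(axis=1)
expected = np.eye(3) - W_check.toarray() / np.sqrt(np.outer(d, d))
assert np.allclose(L_sym.toarray(), expected)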
In [57]:
def is_symmetric(M):
    '''
    Check whether a sparse matrix is symmetric
    Arguments
        M: sparse matrix
    Returns
        boolean: True iff M equals its transpose
    '''
    # M is symmetric iff M and M^T disagree in no entry; note that the original
    # triu/tril version shadowed the imported functions and compared indices incorrectly
    return (M != M.T).nnz == 0
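A two-matrix smoke test (a sketch):
S = csr_matrix(np.array([[0., 1.], [1., 0.]]))  # symmetric
A = csr_matrix(np.array([[0., 1.], [0., 0.]]))  # not symmetric
assert is_symmetric(S) and not is_symmetric(A)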
In [58]:
def spectral_shift(M):
    '''
    Perform a spectral shift for efficient computation of the 'SM' eigens
    Arguments
        M: sparse matrix
    Returns
        B: sparse matrix
            Spectrally shifted matrix M
    '''
    w_max, _ = eigsh(M, k=1)  # largest-magnitude eigenvalue of the matrix (eigsh defaults to which='LM')
    I = identity(M.shape[0])  # sparse identity matrix
    B = M - w_max[0] * I  # shift the spectrum so the smallest eigenvalues become the largest in magnitude
    return B
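Why this helps: if $\lambda_{max}$ is the largest eigenvalue of $M$, then $B = M - \lambda_{max} I$ has spectrum $\lambda_i - \lambda_{max} \le 0$, so the smallest eigenvalues of $M$ become the largest-magnitude eigenvalues of $B$, which is the regime ARPACK's 'LM' mode handles best. A small check on a random symmetric matrix (a sketch; adding $\lambda_{max}$ back recovers the original eigenvalues):
A = np.random.rand(20, 20)
A = csr_matrix(A + A.T)                       # random symmetric test matrix
w_shifted, _ = eigsh(spectral_shift(A), k=3, which='LM')
w_max, _ = eigsh(A, k=1)
recovered = np.sort(w_shifted + w_max[0])     # undo the shift
smallest_true = np.sort(np.linalg.eigvalsh(A.toarray()))[:3]
assert np.allclose(recovered, smallest_true)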
In [59]:
def power_iteration(M, num_iter=1000):
    '''
    Computes the eigenvector of M whose eigenvalue has the largest absolute value
    Arguments
        M: sparse matrix
        num_iter: int
            Number of iterations (stopping criterion of the algorithm)
    Returns
        v: ndarray
            Dominant eigenvector of M
    '''
    v = np.random.normal(size=(M.shape[0], 1))  # initialize a random vector
    v = v / np.linalg.norm(v, axis=0, ord=2)  # normalize
    for _ in range(num_iter):
        temp = M * v  # store M*v to avoid recomputation
        v = temp / np.linalg.norm(temp, axis=0, ord=2)
    return v
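Since power_iteration is not exercised elsewhere in this notebook, a quick self-check against eigsh (a sketch; the comparison is up to sign, and it assumes the top eigenvalue is strictly dominant in magnitude):
A = np.random.rand(30, 30)
A = csr_matrix(A + A.T)
v_pi = power_iteration(A).ravel()
_, v_ref = eigsh(A, k=1, which='LM')
assert np.allclose(np.abs(v_pi), np.abs(v_ref.ravel()), atol=1e-6)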
In [60]:
def get_eigens(M, k, spectral, which, tol):
    '''
    Compute eigens of a sparse matrix
    Arguments
        M: sparse matrix
        k: int
            Number of eigenvalues
        spectral: boolean
            Whether to compute the eigenvalues of the spectrally shifted matrix
        which: string
            See sparse.linalg.eigsh docs
        tol: double
            See sparse.linalg.eigsh docs
    Returns
        w: ndarray
            k eigenvalues (shifted by -lambda_max if spectral is True)
        v: ndarray
            k eigenvectors
    '''
    if spectral:
        M = spectral_shift(M)  # spectral shift for efficient computation of the k smallest eigenvalues
    return eigsh(M, k, which=which, tol=tol)
In [61]:
def cluster(data, k):
    '''
    Performs the standard k-means clustering algorithm
    Arguments
        data: ndarray
            An 'M' by 'N' array of 'M' observations in 'N' dimensions, or a
            length-'M' array of 'M' one-dimensional observations.
        k: int
            The number of clusters to form as well as the number of centroids to generate.
    Returns
        cent: ndarray
            The k centroids
        labels: ndarray
            The centroid index each observation is assigned to
    '''
    cent, labels = kmeans2(data, k)
    return (cent, labels)
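One caveat: with the default random initialization, kmeans2 can return empty clusters, which would later break the 1/sqrt(|C_i|) scaling in min_ratio_cut. A hedged alternative using k-means++ seeding (cluster_pp is a hypothetical helper, not used below; minit='++' requires SciPy >= 1.2):
def cluster_pp(data, k, seed=42):
    # k-means++ seeding tends to avoid empty clusters; the seed fixes reproducibility
    np.random.seed(seed)
    cent, labels = kmeans2(data, k, minit='++')
    return (cent, labels)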
In [62]:
def spectral_clustering(eigens, k):
    '''
    Performs spectral clustering on the spectral embedding of the data
    Arguments
        eigens: ndarray
            Eigenvectors of the Laplacian (one row per instance)
        k: int
            Number of clusters
    Returns
        labels: ndarray
            Cluster labels assigned to the instances
    '''
    labels = cluster(eigens, k)[1]
    return labels
In [63]:
def plot_data(data, k):
    '''
    Plots a histogram of the data with k bins
    Arguments
        data: ndarray
        k: int
            Number of bins
    Returns
        hist: ndarray
            The bin counts
    '''
    hist, bins = np.histogram(data, bins=k, density=False)  # 'normed' is deprecated in favor of 'density'
    width = 0.2 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center, hist, align='center', width=width)
    plt.show()
    return hist
From the lecture we know that minimizing the ratio cut for k > 2 is equivalent to solving the following minimization problem:
$\min_{C_1, \dots, C_k} \operatorname{RatioCut}(C_1, \dots, C_k) = \min_{C_1, \dots, C_k} \operatorname{tr}(H^T L H)$ subject to $H^T H = I$
For the constrained relaxation of the problem we allow arbitrary real values for $H$, thus
$\min_{H \in \mathbb{R}^{|V| \times k}} \operatorname{tr}(H^T L H)$ subject to $H^T H = I$
This is a standard trace minimization problem, for which we know from the lecture that the solution is obtained by choosing $H$ as the matrix whose columns are the first $k$ eigenvectors of $L$.
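Concretely, the columns of the optimal $H$ are orthonormal eigenvectors, so $\operatorname{tr}(H^T L H)$ equals the sum of the $k$ smallest eigenvalues of $L$. A quick numerical check on a hand-built toy Laplacian (a sketch; the underlying graph has edges 0-1, 0-2, 1-2, 2-3):
L_toy = csr_matrix(np.array([[ 2., -1., -1.,  0.],
                             [-1.,  2., -1.,  0.],
                             [-1., -1.,  3., -1.],
                             [ 0.,  0., -1.,  1.]]))
w_toy, H_toy = eigsh(L_toy, k=2, which='SM')  # two smallest eigenpairs
assert np.allclose(H_toy.T.dot(L_toy.dot(H_toy)).trace(), w_toy.sum())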
In [82]:
def soft_min_ratio_cut(H, L):
    '''
    Computes the constrained-relaxation min ratio cut, i.e. the indicator matrix may take arbitrary values
    Arguments
        H: ndarray (or sparse matrix)
            Indicator matrix (eigenvectors as columns)
        L: sparse matrix
            Laplacian matrix
    Returns
        mrc: float
            Minimum ratio cut
    '''
    H = csr_matrix(H)
    mrc = (H.transpose() * L * H).diagonal().sum()  # trace(H^T L H)
    return mrc
In [83]:
def min_ratio_cut(labels, L, k):
    '''
    Calculate the min ratio cut of the graph clustering given by labels. The Laplacian can be normalized or un-normalized.
    Arguments
        labels: ndarray
            Labels of instances computed e.g. by k-means
        L: sparse matrix
            Laplacian matrix
        k: int
            Number of clusters
    Returns
        mrc: float
            Minimum ratio cut
    '''
    H = []
    for i in range(k):
        indices = np.argwhere(labels == i)
        h = np.zeros_like(labels, dtype=np.float64)
        h[indices[:, 0]] = 1.0 / sqrt(indices.shape[0])  # indicator entries scaled by 1/sqrt(|C_i|)
        H.append(h)
    H = csr_matrix(np.asarray(H))
    mrc = (H * L * H.transpose()).diagonal().sum()  # trace(H L H^T)
    return mrc
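With this hard indicator matrix, $H_{i,v} = 1/\sqrt{|C_i|}$ for $v \in C_i$ and 0 otherwise, so $\operatorname{tr}(H L H^T)$ is exactly the ratio cut. On a graph with two disconnected components, a perfect 2-clustering should therefore score 0 (a sketch):
W2 = csr_matrix(np.array([[0., 1., 0., 0.],
                          [1., 0., 0., 0.],
                          [0., 0., 0., 1.],
                          [0., 0., 1., 0.]]))
L2 = get_laplacian_matrix(get_degree_matrix(W2), W2, normalized=False)
assert np.isclose(min_ratio_cut(np.array([0, 0, 1, 1]), L2, k=2), 0.0)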
In [84]:
def get_songs_to_tags_matrix(files, song_to_row, g):
    '''
    Construct the songs-to-tags matrix in COOrdinate format
    Arguments
        files: ndarray
            Filepaths to the song JSON files
        song_to_row: dict
            Maps track ids to row indices (as returned by get_adjacency_matrix)
        g: int
            Threshold for tag strength
    Returns
        N: sparse.coo_matrix
            Songs-to-tags matrix
        tags_to_col: dict
            Maps each tag to its column index in N
    '''
    row = []
    col = []
    data = []
    tags_to_col = {}
    col_count = count()
    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
        tid = song['track_id']
        for tag, cnt in song['tags']:
            if tag not in tags_to_col:
                tags_to_col[tag] = next(col_count)  # assign a unique column index to each tag
            if int(cnt) >= g:  # keep only tags whose strength reaches the threshold g
                row.append(song_to_row[tid])
                col.append(tags_to_col[tag])
                data.append(1)
    N = coo_matrix((data, (row, col)), shape=(len(song_to_row), len(tags_to_col)))
    return (N, tags_to_col)
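Continuing the synthetic files from the adjacency sketch above (tag counts are strings in the dumps, hence the int(cnt) cast; with g=50 only 'rock' passes the threshold for tracks A and B):
N_toy, t2c_toy = get_songs_to_tags_matrix(np.array(toy_files), idx_toy, g=50)
print(N_toy.toarray())  # rows A and B get a 1 in the 'rock' column; 'indie' (count 20) is filtered out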
In [67]:
def plot_tags_per_cluster(labels, s2t, k):
    '''
    Plots the tag distribution of each of the k clusters, one figure per cluster
    Arguments
        labels: ndarray
            Cluster labels assigned to the instances
        s2t: sparse matrix
            Songs-to-tags matrix
        k: int
            Number of clusters
    '''
    s2t = s2t.tocsr()  # convert once for row slicing
    for i in range(k):
        class_ids = np.argwhere(labels == i)  # rows belonging to cluster i
        tag_dist = np.asarray(s2t[class_ids[:, 0], :].sum(axis=0)).ravel()  # per-tag counts within the cluster
        plot_data(tag_dist, s2t.shape[1])
In [68]:
basedir = 'lastfm_subset'
ext = '.json'
t = 0
normalized = False
k = 10
spectral = True
which = 'LM'
tol = 10e-2
g = 50
In [69]:
files = get_all_files(basedir, ext)[0] # get all filepaths in the specified dataset directory (='lastfm_subset')
In [70]:
W, song_indices = get_adjacency_matrix(files, t) # get weighted symmetric adjacency matrix and song indices
In [73]:
D = get_degree_matrix(W) # get out-degree matrix
In [74]:
L = get_laplacian_matrix(D, W, normalized) # get laplacian matrix
In [78]:
w, v = get_eigens(L, k, spectral, which, tol) # get the k smallest eigenvalues (via the spectral shift) and their eigenvectors
In [79]:
labels = spectral_clustering(v, k) # get spectral clustering of data graph
In [80]:
plot_data(labels, k) # plot distribution of labels to k clusters
Out[80]: (inline histogram of the label distribution over the k clusters)
In [27]:
soft_mrc = soft_min_ratio_cut(v, L) # compute the relaxed (soft) min ratio cut of the data graph with k clusters
In [28]:
mrc = min_ratio_cut(labels, L, k) # perform min ratio-cut on data graph with k clusters
In [29]:
s2t, t2c = get_songs_to_tags_matrix(files, song_indices, g) # compute song_to_tags matrix and tags_to_column_indices
In [ ]:
plot_tags_per_cluster(labels, s2t, k) # plot the tag distribution of each cluster